{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Inference sentence-transformer Model with ONNX Runtime on CPU\n",
    "inference `0_Transformer` of sentence-transformer, not the entire model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "# Here we use paraphrase-multilingual-MiniLM-L12-v2 for demo.\n",
    "big_model_path = \"../models/paraphrase-multilingual-MiniLM-L12-v2\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# 1. Load Pretrained Bert model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "../models/paraphrase-multilingual-MiniLM-L12-v2/0_Transformer\n"
     ]
    }
   ],
   "source": [
    "modules_json_path = os.path.join(big_model_path, 'modules.json')\n",
    "with open(modules_json_path) as fIn:\n",
    "    modules_config = json.load(fIn)\n",
    "\n",
    "tf_from_s_path = os.path.join(big_model_path, modules_config[0].get('path'))\n",
    "print(tf_from_s_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "Specify some model configuration variables and constant."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "max_seq_length = 128\n",
    "doc_stride = 128\n",
    "max_query_length = 64\n",
    "\n",
    "# Enable overwrite to export onnx model and download latest script each time when running this notebook.\n",
    "enable_overwrite = True\n",
    "\n",
    "# Total samples to inference. It shall be large enough to get stable latency measurement.\n",
    "total_samples = 1000\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "cache_dir = os.path.join(\".\", \"cache_models\")\n",
    "cache_dir\n",
    "if not os.path.exists(cache_dir):\n",
    "    os.makedirs(cache_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# # Load pretrained model and tokenizer\n",
    "\n",
    "from transformers import (AutoConfig, AutoModel, AutoTokenizer)\n",
    "\n",
    "# Load pretrained model and tokenizer\n",
    "config_class, model_class, tokenizer_class = (AutoConfig, AutoModel, AutoTokenizer)\n",
    "\n",
    "config = config_class.from_pretrained(tf_from_s_path, cache_dir=cache_dir)\n",
    "tokenizer = tokenizer_class.from_pretrained(tf_from_s_path, do_lower_case=True, cache_dir=cache_dir)\n",
    "model = model_class.from_pretrained(tf_from_s_path, from_tf=False, config=config, cache_dir=cache_dir)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[    0, 73014,  1322,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get the first example data to run the model and export it to ONNX\n",
    "\n",
    "st = ['您好']\n",
    "inputs = tokenizer(\n",
    "    st,\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    max_length=512,\n",
    "    return_tensors=\"pt\"\n",
    ")\n",
    "inputs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# 2. Export the loaded model\n",
    "Once the model is loaded, we can export the loaded PyTorch model to ONNX."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model exported at  ../onnx_models/Multilingual_MiniLM_L12.onnx\n"
     ]
    }
   ],
   "source": [
    "output_dir = os.path.join(\"..\", \"onnx_models\")\n",
    "if not os.path.exists(output_dir):\n",
    "    os.makedirs(output_dir)\n",
    "export_model_path = os.path.join(output_dir, 'Multilingual_MiniLM_L12.onnx')\n",
    "\n",
    "import torch\n",
    "device = torch.device(\"cpu\")\n",
    "\n",
    "\n",
    "\n",
    "# Set model to inference mode, which is required before exporting the model because some operators behave differently in\n",
    "# inference and training mode.\n",
    "model.eval()\n",
    "model.to(device)\n",
    "\n",
    "if enable_overwrite or not os.path.exists(export_model_path):\n",
    "    with torch.no_grad():\n",
    "        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}\n",
    "        torch.onnx.export(model,                                            # model being run\n",
    "                          args=tuple(inputs.values()),                      # model input (or a tuple for multiple inputs)\n",
    "                          f=export_model_path,                              # where to save the model (can be a file or file-like object)\n",
    "                          opset_version=11,                                 # the ONNX version to export the model to\n",
    "                          do_constant_folding=True,                         # whether to execute constant folding for optimization\n",
    "                          input_names=['input_ids',                         # the model's input names\n",
    "                                       'attention_mask',\n",
    "                                       'token_type_ids'],\n",
    "                          output_names=['start', 'end'],                    # the model's output names\n",
    "                          dynamic_axes={'input_ids': symbolic_names,        # variable length axes\n",
    "                                        'attention_mask' : symbolic_names,\n",
    "                                        'token_type_ids' : symbolic_names,\n",
    "                                        'start' : symbolic_names,\n",
    "                                        'end' : symbolic_names})\n",
    "        print(\"Model exported at \", export_model_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# 3. PyTorch Inference\n",
    "Use PyTorch to evaluate an example input for comparison purpose."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [00:16<00:00, 61.52it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PyTorch cpu Inference time = 16.09 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "from tqdm import tqdm\n",
    "# Measure the latency. It is not accurate using Jupyter Notebook, it is recommended to use standalone python script.\n",
    "latency = []\n",
    "with torch.no_grad():\n",
    "    for i in tqdm(range(total_samples)):\n",
    "        # data = dataset[i]\n",
    "        # inputs = {\n",
    "        #     'input_ids':      data[0].to(device).reshape(1, max_seq_length),\n",
    "        #     'attention_mask': data[1].to(device).reshape(1, max_seq_length),\n",
    "        #     'token_type_ids': data[2].to(device).reshape(1, max_seq_length)\n",
    "        # }\n",
    "        start = time.time()\n",
    "        outputs = model(**inputs)\n",
    "        latency.append(time.time() - start)\n",
    "print(\"PyTorch {} Inference time = {} ms\".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# 4. Inference ONNX Model with ONNX Runtime\n",
    "For Onnx Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of Bert model. Since 1.7.0, the official package is not built with OpenMP.\n",
    "\n",
    "Now we inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-02-24 11:21:16.715169653 [W:onnxruntime:, inference_session.cc:1407 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.\n",
      "100%|██████████| 2000/2000 [00:14<00:00, 137.57it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "OnnxRuntime cpu Inference time = 7.13 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import onnxruntime\n",
    "import numpy\n",
    "\n",
    "sess_options = onnxruntime.SessionOptions()\n",
    "\n",
    "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n",
    "# Note that this will increase session creation time, so it is for debugging only.\n",
    "sess_options.optimized_model_filepath = os.path.join(output_dir, \"Multilingual-MiniLM-L12_optimized_model_cpu.onnx\")\n",
    "\n",
    "# For OnnxRuntime 1.7.0 or later, you can set intra_op_num_threads to set thread number like\n",
    "#    sess_options.intra_op_num_threads=4\n",
    "# Here we use the default value which is a good choice in most cases.\n",
    "\n",
    "# Specify providers when you use onnxruntime-gpu for CPU inference.\n",
    "session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
    "\n",
    "latency = []\n",
    "for i in tqdm(range(2000)):\n",
    "    # data = dataset[i]\n",
    "    ort_inputs = {k:v.cpu().numpy() for k, v in inputs.items()}\n",
    "    # ort_inputs = {\n",
    "    #     'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),\n",
    "    #     'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n",
    "    #     'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n",
    "    # }\n",
    "    start = time.time()\n",
    "    ort_outputs = session.run(None, ort_inputs)\n",
    "    latency.append(time.time() - start)\n",
    "print(\"OnnxRuntime cpu Inference time = {} ms\".format(format(sum(latency) * 1000 / len(latency), '.2f')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "# 5. Verifying"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "***** Verifying correctness *****\n",
      "PyTorch and ONNX Runtime output 0 are close: True\n"
     ]
    }
   ],
   "source": [
    "print(\"***** Verifying correctness *****\")\n",
    "for i in range(1):\n",
    "    print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-05, atol=1e-05))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}